Data Exploration with Pandas



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np



In [2]:

    
# Load the Titanic training set into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('./data/titanic-train.csv')



In [3]:

    
type(df)









    Out[3]:





pandas.core.frame.DataFrame



In [4]:

    
df.head()









    Out[4]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [5]:

    
# Summarise the DataFrame: index range, per-column non-null counts and dtypes,
# and memory usage. A column with fewer non-null entries than rows has missing values.
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [6]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()









    Out[6]:







  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200

Indexing



In [7]:

    
# Position-based lookup: the row at integer position 3 (the fourth row), returned as a Series
df.iloc[3]









    Out[7]:





PassengerId                                               4
Survived                                                  1
Pclass                                                    1
Name           Futrelle, Mrs. Jacques Heath (Lily May Peel)
Sex                                                  female
Age                                                      35
SibSp                                                     1
Parch                                                     0
Ticket                                               113803
Fare                                                   53.1
Cabin                                                  C123
Embarked                                                  S
Name: 3, dtype: object



In [8]:

    
# Label-based slicing: unlike ordinary Python slices, .loc includes the end label,
# so this returns the 'Ticket' values of the five rows labelled 0 through 4
df.loc[0:4,'Ticket']









    Out[8]:





0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object



In [9]:

    
df['Ticket'].head()









    Out[9]:





0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
Name: Ticket, dtype: object



In [10]:

    
# Indexing with a list of column names selects several columns and returns a DataFrame
df[['Embarked', 'Ticket']].head()









    Out[10]:







  
    
      
      Embarked
      Ticket
    
  
  
    
      0
      S
      A/5 21171
    
    
      1
      C
      PC 17599
    
    
      2
      S
      STON/O2. 3101282
    
    
      3
      S
      113803
    
    
      4
      S
      373450

Selections



In [11]:

    
# Boolean-mask selection: keep only the rows whose 'Age' is greater than 70
df.loc[df['Age'].gt(70)]









    Out[11]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      96
      97
      0
      1
      Goldschmidt, Mr. George B
      male
      71.0
      0
      0
      PC 17754
      34.6542
      A5
      C
    
    
      116
      117
      0
      3
      Connors, Mr. Patrick
      male
      70.5
      0
      0
      370369
      7.7500
      NaN
      Q
    
    
      493
      494
      0
      1
      Artagaveytia, Mr. Ramon
      male
      71.0
      0
      0
      PC 17609
      49.5042
      NaN
      C
    
    
      630
      631
      1
      1
      Barkworth, Mr. Algernon Henry Wilson
      male
      80.0
      0
      0
      27042
      30.0000
      A23
      S
    
    
      851
      852
      0
      3
      Svensson, Mr. Johan
      male
      74.0
      0
      0
      347060
      7.7750
      NaN
      S



In [12]:

    
# The comparison on its own yields a boolean Series (one True/False per row);
# this is the mask used for the row selection above
df['Age'] > 70









    Out[12]:





0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
861    False
862    False
863    False
864    False
865    False
866    False
867    False
868    False
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878    False
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool



In [13]:

    
# The same selection written as a query string evaluated against the column names
df.query("Age > 70")









    Out[13]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      96
      97
      0
      1
      Goldschmidt, Mr. George B
      male
      71.0
      0
      0
      PC 17754
      34.6542
      A5
      C
    
    
      116
      117
      0
      3
      Connors, Mr. Patrick
      male
      70.5
      0
      0
      370369
      7.7500
      NaN
      Q
    
    
      493
      494
      0
      1
      Artagaveytia, Mr. Ramon
      male
      71.0
      0
      0
      PC 17609
      49.5042
      NaN
      C
    
    
      630
      631
      1
      1
      Barkworth, Mr. Algernon Henry Wilson
      male
      80.0
      0
      0
      27042
      30.0000
      A23
      S
    
    
      851
      852
      0
      3
      Svensson, Mr. Johan
      male
      74.0
      0
      0
      347060
      7.7750
      NaN
      S



In [14]:

    
# Rows that satisfy both conditions at once: Age equal to 11 AND SibSp equal to 5
df[df['Age'].eq(11) & df['SibSp'].eq(5)]









    Out[14]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      59
      60
      0
      3
      Goodwin, Master. William Frederick
      male
      11.0
      5
      2
      CA 2144
      46.9
      NaN
      S



In [15]:

    
# Rows that satisfy at least one condition: Age equal to 11 OR SibSp equal to 5
df[df['Age'].eq(11) | df['SibSp'].eq(5)]









    Out[15]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      59
      60
      0
      3
      Goodwin, Master. William Frederick
      male
      11.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      71
      72
      0
      3
      Goodwin, Miss. Lillian Amy
      female
      16.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      386
      387
      0
      3
      Goodwin, Master. Sidney Leonard
      male
      1.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      480
      481
      0
      3
      Goodwin, Master. Harold Victor
      male
      9.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      542
      543
      0
      3
      Andersson, Miss. Sigrid Elisabeth
      female
      11.0
      4
      2
      347082
      31.2750
      NaN
      S
    
    
      683
      684
      0
      3
      Goodwin, Mr. Charles Edward
      male
      14.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      731
      732
      0
      3
      Hassan, Mr. Houssein G N
      male
      11.0
      0
      0
      2699
      18.7875
      NaN
      C
    
    
      802
      803
      1
      1
      Carter, Master. William Thornton II
      male
      11.0
      1
      2
      113760
      120.0000
      B96 B98
      S



In [16]:

    
# The same OR selection written as a query string
df.query('(Age == 11) | (SibSp == 5)')









    Out[16]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      59
      60
      0
      3
      Goodwin, Master. William Frederick
      male
      11.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      71
      72
      0
      3
      Goodwin, Miss. Lillian Amy
      female
      16.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      386
      387
      0
      3
      Goodwin, Master. Sidney Leonard
      male
      1.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      480
      481
      0
      3
      Goodwin, Master. Harold Victor
      male
      9.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      542
      543
      0
      3
      Andersson, Miss. Sigrid Elisabeth
      female
      11.0
      4
      2
      347082
      31.2750
      NaN
      S
    
    
      683
      684
      0
      3
      Goodwin, Mr. Charles Edward
      male
      14.0
      5
      2
      CA 2144
      46.9000
      NaN
      S
    
    
      731
      732
      0
      3
      Hassan, Mr. Houssein G N
      male
      11.0
      0
      0
      2699
      18.7875
      NaN
      C
    
    
      802
      803
      1
      1
      Carter, Master. William Thornton II
      male
      11.0
      1
      2
      113760
      120.0000
      B96 B98
      S

Unique Values



In [17]:

    
# Distinct values found in 'Embarked'; missing entries appear as nan in the result
df['Embarked'].unique()









    Out[17]:





array(['S', 'C', 'Q', nan], dtype=object)

Sorting



In [18]:

    
# Oldest passengers first: order the rows by 'Age' from highest to lowest
# and show the first five of the sorted result
df.sort_values(by='Age', ascending=False).head(n=5)









    Out[18]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      630
      631
      1
      1
      Barkworth, Mr. Algernon Henry Wilson
      male
      80.0
      0
      0
      27042
      30.0000
      A23
      S
    
    
      851
      852
      0
      3
      Svensson, Mr. Johan
      male
      74.0
      0
      0
      347060
      7.7750
      NaN
      S
    
    
      493
      494
      0
      1
      Artagaveytia, Mr. Ramon
      male
      71.0
      0
      0
      PC 17609
      49.5042
      NaN
      C
    
    
      96
      97
      0
      1
      Goldschmidt, Mr. George B
      male
      71.0
      0
      0
      PC 17754
      34.6542
      A5
      C
    
    
      116
      117
      0
      3
      Connors, Mr. Patrick
      male
      70.5
      0
      0
      370369
      7.7500
      NaN
      Q



In [19]:

    
# Youngest passengers first: order the rows by 'Age' from lowest to highest
# and show the first five of the sorted result
df.sort_values(by='Age', ascending=True).head(n=5)









    Out[19]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      803
      804
      1
      3
      Thomas, Master. Assad Alexander
      male
      0.42
      0
      1
      2625
      8.5167
      NaN
      C
    
    
      755
      756
      1
      2
      Hamalainen, Master. Viljo
      male
      0.67
      1
      1
      250649
      14.5000
      NaN
      S
    
    
      644
      645
      1
      3
      Baclini, Miss. Eugenie
      female
      0.75
      2
      1
      2666
      19.2583
      NaN
      C
    
    
      469
      470
      1
      3
      Baclini, Miss. Helene Barbara
      female
      0.75
      2
      1
      2666
      19.2583
      NaN
      C
    
    
      78
      79
      1
      2
      Caldwell, Master. Alden Gates
      male
      0.83
      0
      2
      248738
      29.0000
      NaN
      S

Aggregations



In [20]:

    
# Number of rows for each distinct value of 'Survived', most frequent value first
df['Survived'].value_counts()









    Out[20]:





0    549
1    342
Name: Survived, dtype: int64



In [21]:

    
df['Pclass'].value_counts()









    Out[21]:





3    491
1    216
2    184
Name: Pclass, dtype: int64



In [22]:

    
# Passenger count for every (Pclass, Survived) combination. Counting 'PassengerId'
# works as a row count here because that column has no missing values.
df.groupby(['Pclass', 'Survived'])['PassengerId'].count()









    Out[22]:





Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: PassengerId, dtype: int64



In [23]:

    
# Min
df['Age'].min()









    Out[23]:





0.41999999999999998



In [24]:

    
# Max
df['Age'].max()









    Out[24]:





80.0



In [25]:

    
# Mean
df['Age'].mean()









    Out[25]:





29.69911764705882



In [26]:

    
# Median
df['Age'].median()









    Out[26]:





28.0



In [27]:

    
# Mean age within each 'Survived' group; kept in a variable because the Merge
# section below reuses it
mean_age_by_survived = df.groupby('Survived')['Age'].mean()
mean_age_by_survived









    Out[27]:





Survived
0    30.626179
1    28.343690
Name: Age, dtype: float64



In [28]:

    
# Standard deviation of age within each 'Survived' group; also reused in the
# Merge section below
std_age_by_survived = df.groupby('Survived')['Age'].std()
std_age_by_survived









    Out[28]:





Survived
0    14.172110
1    14.950952
Name: Age, dtype: float64

Merge



In [29]:

    
# Round each per-group statistic to a whole number and turn the 'Survived' index
# into an ordinary column, so the two results can be merged on it
df1 = mean_age_by_survived.round().reset_index()
df2 = std_age_by_survived.round().reset_index()



In [30]:

    
df1



In [31]:

    
df2



In [32]:

    
# Join the two summaries on their shared 'Survived' key. Both frames carry an
# 'Age' column, so the merged frame holds suffixed copies of it until the
# columns are renamed.
df3 = df1.merge(df2, on='Survived')



In [33]:

    
df3



In [34]:

    
# Giving descriptive names to the columns of the merged frame
# (the assignment is positional: one new name per existing column, in order)
df3.columns = ['Survived', 'Average Age', 'Age Standard Deviation']



In [35]:

    
df3









    Out[35]:







  
    
      
      Survived
      Average Age
      Age Standard Deviation
    
  
  
    
      0
      0
      31.0
      14.0
    
    
      1
      1
      28.0
      15.0

Pivot Tables



In [36]:

    
# Cross-tabulate passenger counts: one row per class, one column per survival outcome
pd.pivot_table(df,
               values='PassengerId',
               index='Pclass',
               columns='Survived',
               aggfunc='count')

Correlations



In [37]:

    
# Add a boolean feature that is True for female passengers; being boolean,
# it can take part in the correlation computed below
df['IsFemale'] = df['Sex'].eq('female')



In [38]:

    
df['IsFemale'].head()









    Out[38]:





0    False
1     True
2     True
3     True
4    False
Name: IsFemale, dtype: bool



In [39]:

    
# Pearson correlation of every numeric/boolean column with 'Survived', sorted
# from most negative to most positive.
# Text columns (Name, Sex, Ticket, ...) are excluded explicitly: recent pandas
# releases no longer drop them silently in DataFrame.corr() and raise instead.
correlated_with_survived = (
    df.select_dtypes(include=['number', 'bool'])
      .corr()['Survived']
      .sort_values()
)
correlated_with_survived









    Out[39]:





Pclass        -0.338481
Age           -0.077221
SibSp         -0.035322
PassengerId   -0.005007
Parch          0.081629
Fare           0.257307
IsFemale       0.543351
Survived       1.000000
Name: Survived, dtype: float64



In [40]:

    
# NOTE: inline plotting was already enabled in the first cell, so this repeat is redundant
%matplotlib inline



In [41]:

    
# Plotting correlation with column 'Survived'.
# 'Survived' itself is left out because its correlation with itself is trivially 1.
# It is dropped by label rather than by position, so the result does not depend on
# where the sort happened to place it.
correlated_with_survived.drop('Survived').plot(kind='bar',
                                               title='Titanic Passengers: correlation with survival')









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f61513a2710>



In [42]:

    
# Full pairwise Pearson correlation matrix of the numeric/boolean columns.
# Text columns are excluded explicitly because recent pandas releases raise
# instead of silently skipping them in DataFrame.corr().
df.select_dtypes(include=['number', 'bool']).corr()









    Out[42]:







  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      IsFemale
    
  
  
    
      PassengerId
      1.000000
      -0.005007
      -0.035144
      0.036847
      -0.057527
      -0.001652
      0.012658
      -0.042939
    
    
      Survived
      -0.005007
      1.000000
      -0.338481
      -0.077221
      -0.035322
      0.081629
      0.257307
      0.543351
    
    
      Pclass
      -0.035144
      -0.338481
      1.000000
      -0.369226
      0.083081
      0.018443
      -0.549500
      -0.131900
    
    
      Age
      0.036847
      -0.077221
      -0.369226
      1.000000
      -0.308247
      -0.189119
      0.096067
      -0.093254
    
    
      SibSp
      -0.057527
      -0.035322
      0.083081
      -0.308247
      1.000000
      0.414838
      0.159651
      0.114631
    
    
      Parch
      -0.001652
      0.081629
      0.018443
      -0.189119
      0.414838
      1.000000
      0.216225
      0.245489
    
    
      Fare
      0.012658
      0.257307
      -0.549500
      0.096067
      0.159651
      0.216225
      1.000000
      0.182333
    
    
      IsFemale
      -0.042939
      0.543351
      -0.131900
      -0.093254
      0.114631
      0.245489
      0.182333
      1.000000

Visual Data Exploration with Matplotlib



In [43]:

    
# Four synthetic series of 1000 points each, with deliberately different shapes:
#   data1 - Gaussian noise around 0
#   data2 - Gaussian noise around 1 plus a linear ramp from 0 to 1
#   data3 - uniform noise whose amplitude grows linearly from 1 to 5, offset by 2
#   data4 - Gaussian noise around 3 plus a sine wave of amplitude 0.3
# The global NumPy generator is seeded first so every run draws the same samples
# and the plots and counts below are reproducible.
np.random.seed(0)
data1 = np.random.normal(0, 0.1, 1000)
data2 = np.random.normal(1, 0.4, 1000) + np.linspace(0, 1, 1000)
data3 = 2 + np.random.random(1000) * np.linspace(1, 5, 1000)
data4 = np.random.normal(3, 0.2, 1000) + 0.3 * np.sin(np.linspace(0, 20, 1000))



In [44]:

    
# Arrange the four 1-D series side by side as the columns of one 2-D array
# (shape: number of samples x 4)
data = np.column_stack([data1, data2, data3, data4])



In [45]:

    
# Wrap the array in a DataFrame with one named column per synthetic series.
# NOTE: this rebinds `df`, so the Titanic frame loaded earlier is no longer
# reachable under that name from here on.
df = pd.DataFrame(data, columns = ['data1', 'data2', 'data3', 'data4'])
df.head()

Line Plot



In [46]:

    
# One line per column, plotted against the row index
df.plot.line(figsize=(7, 7), title='Line plot')









    Out[46]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f61512e2588>



In [47]:

    
# The same line plot drawn through pyplot directly; here the title and the
# legend have to be added by hand, with the legend entries taken from the column names
plt.plot(df)
plt.title('Line plot')
plt.legend(list(df.columns))









    Out[47]:





<matplotlib.legend.Legend at 0x7f614c207e10>

Scatter Plot



In [48]:

    
# style '.' draws every sample as a point marker instead of joining samples with lines
df.plot(style = '.', 
       figsize = (7, 7),)









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f614c1d36a0>



In [49]:

    
# Scatter of data2 against data1 with fixed axis limits
df.plot.scatter(x='data1',
                y='data2',
                xlim=(-1.5, 1.5),
                ylim=(0, 3),
                figsize=(7, 7))









    Out[49]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f614c15c978>

Histograms



In [50]:

    
# Overlaid histograms of all columns; the partial transparency keeps
# overlapping distributions visible
df.plot.hist(bins=50,
             alpha=0.6,
             title='Histogram',
             figsize=(7, 7))









    Out[50]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f614c0debe0>

Cumulative distribution



In [51]:

    
# Cumulative, normalised histograms: each curve rises to 1 and approximates the
# empirical cumulative distribution of one column.
# `density` replaces the `normed` keyword, which matplotlib deprecated and later removed.
df.plot(kind = 'hist',
        figsize = (7, 7),
        bins = 100,
        title = 'Cumulative distributions',
        density = True,
        cumulative = True,
        alpha = 0.4)









    Out[51]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6147fc1c18>

Box Plot



In [52]:

    
# One box-and-whisker summary per column
df.plot.box(title='Boxplot', figsize=(7, 7))









    Out[52]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6147cf0e80>

Subplots



In [53]:

    
# 2x2 grid showing the same DataFrame four ways; each plot is directed to its
# own cell of the grid through the `ax` argument
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

df.plot.line(ax=ax[0, 0], title='Line plot')
df.plot(style='o', ax=ax[0, 1], title='Scatter plot')
df.plot.hist(bins=50, ax=ax[1, 0], title='Histogram')
df.plot.box(ax=ax[1, 1], title='Boxplot')

# Adjust the spacing so titles and tick labels of neighbouring panels do not overlap
plt.tight_layout()

Pie charts



In [54]:

    
# Flag the samples of data1 that exceed 0.1, then count how many fall on each side
gt01 = df['data1'].gt(0.1)
piecounts = gt01.value_counts()
piecounts









    Out[54]:





False    851
True     149
Name: data1, dtype: int64



In [55]:

    
# Plotting a pie chart
# explode : how far each wedge is pushed out from the centre (as a fraction of the radius)
# autopct : format string for the percentage printed on each wedge
# `labels` and `explode` are matched to the wedges by position, so the counts are
# first put in a fixed [False, True] order instead of relying on value_counts()
# returning the more frequent value first.
piecounts.reindex([False, True], fill_value = 0).plot(kind = 'pie',
               figsize = (7, 7),
               explode = [0.0, 0.15],
               labels = ['<= 0.1', '> 0.1'],
               autopct = '%1.2f%%',
               shadow = True,
               startangle = 90,
               fontsize = 16)
plt.legend(loc = "best")









    Out[55]:





<matplotlib.legend.Legend at 0x7f6147a56ef0>

Hexbin plot



In [56]:

    
# Two 2-D Gaussian clouds stacked row-wise: 1000 points around (0, 0) with
# spread 2, followed by 2000 points around (9, 9) with spread 3
data = np.concatenate(
    [np.random.normal(loc=(0, 0), scale=2, size=(1000, 2)),
     np.random.normal(loc=(9, 9), scale=3, size=(2000, 2))],
    axis=0,
)
df = pd.DataFrame(data, columns=['x', 'y'])



In [57]:

    
df.head()



In [58]:

    
df.plot()









    Out[58]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6147383668>



In [59]:

    
df.plot(kind = 'kde')









    Out[59]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6146d83198>



In [60]:

    
# 2-D density view: the plane is tiled with hexagons coloured by how many points fall in each.
# NOTE(review): for hexbin, `bins` controls how the per-hexagon counts are grouped
# for colouring, while the number of hexagons is set by `gridsize` - confirm that
# `bins = 100` is what was intended here.
df.plot(kind='hexbin', 
        x = 'x', 
        y = 'y', 
        bins = 100, 
        cmap = 'rainbow')









    Out[60]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f61471b1c88>

Unstructured data

Images



In [61]:

    
from PIL import Image



In [62]:

    
# Open the JPEG with Pillow; ending the cell with `img` makes the notebook render the picture
img = Image.open('./data/iss.jpg')
img









    Out[62]:



In [63]:

    
type(img)









    Out[63]:





PIL.JpegImagePlugin.JpegImageFile



In [64]:

    
# Convert the Pillow image into a NumPy array of pixel values
imgarray = np.asarray(img)



In [65]:

    
type(imgarray)









    Out[65]:





numpy.ndarray



In [66]:

    
# Shape is (rows, columns, channels): 3 colour channels, each 435 by 640 pixels
imgarray.shape









    Out[66]:





(435, 640, 3)



In [67]:

    
# ravel() flattens the array to one dimension; its length is the product of the
# three dimensions (verified by the multiplication in the next cell)
imgarray.ravel().shape









    Out[67]:





(835200,)



In [68]:

    
435 * 640 * 3









    Out[68]:





835200

Sound



In [69]:

    
from scipy.io import wavfile



In [70]:

    
# Read the WAV file: `rate` is the sampling rate (samples per second) and
# `sound` holds the samples as a NumPy array
rate, sound = wavfile.read(filename='./data/sms.wav')



In [71]:

    
from IPython.display import Audio



In [72]:

    
Audio(data = sound, rate = rate)









    Out[72]:



In [73]:

    
# Play the same samples at half the original sampling rate: the clip lasts
# twice as long and sounds lower-pitched
Audio(data = sound, rate = 0.5 * rate)









    Out[73]:



In [74]:

    
len(sound)









    Out[74]:





110250



In [75]:

    
sound









    Out[75]:





array([70, 14, 27, ..., 58, 68, 59], dtype=int16)



In [76]:

    
# Waveform: sample value (vertical axis) against sample index (horizontal axis)
plt.plot(sound)









    Out[76]:





[<matplotlib.lines.Line2D at 0x7f613a3666a0>]



In [77]:

    
# Spectrogram of the clip, computed over windows of 1024 samples.
# Fs is the sampling rate returned by wavfile.read rather than a hard-coded
# value, so the time and frequency axes stay correct for any input file.
plt.specgram(sound, NFFT=1024, Fs=rate)
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')









    Out[77]:





<matplotlib.text.Text at 0x7f613b03dcc0>

Data Exploration Exercises

Exercise 1

load the dataset: ./data/international-airline-passengers.csv
inspect it using the .info() and .head() commands
use the function pd.to_datetime() to change the column type of 'Month' to a datetime type
set the index of df to be a datetime index using the column 'Month' and the df.set_index() method
choose the appropriate plot and display the data
choose appropriate scale
label the axes



In [78]:

    
# Exercise 1: load the monthly international airline passenger counts
ex1 = pd.read_csv('./data/international-airline-passengers.csv')



In [79]:

    
ex1.head()









    Out[79]:







  
    
      
      Month
      Thousand Passengers
    
  
  
    
      0
      1949-01
      112
    
    
      1
      1949-02
      118
    
    
      2
      1949-03
      132
    
    
      3
      1949-04
      129
    
    
      4
      1949-05
      121



In [80]:

    
ex1.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
Month                  144 non-null object
Thousand Passengers    144 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ KB



In [81]:

    
# Parse the 'Month' strings (e.g. '1949-01') into proper datetime values
ex1 = ex1.assign(Month=pd.to_datetime(ex1['Month']))



In [82]:

    
# Use the 'Month' column as the (datetime) index.
# The guard keeps the cell re-runnable: once 'Month' has become the index it is
# no longer a column, and calling set_index again would raise a KeyError.
if 'Month' in ex1.columns:
    ex1 = ex1.set_index('Month')
ex1.head()









    Out[82]:







  
    
      
      Thousand Passengers
    
    
      Month
      
    
  
  
    
      1949-01-01
      112
    
    
      1949-02-01
      118
    
    
      1949-03-01
      132
    
    
      1949-04-01
      129
    
    
      1949-05-01
      121



In [83]:

    
# Time series as a line plot. The exercise asks for labelled axes, so the title
# and both axis labels are set explicitly; the legend is dropped because the
# single series is already named by the y-axis label.
passengers_ax = ex1.plot(title='International airline passengers', legend=False)
passengers_ax.set_xlabel('Month')
passengers_ax.set_ylabel('Thousand Passengers')









    Out[83]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f613a815898>

Exercise 2

load the dataset: ./data/weight-height.csv
inspect it
plot it using a scatter plot with Weight as a function of Height
plot the male and female populations with 2 different colors on a new scatter plot
remember to label the axes



In [84]:

    
# Exercise 2: load the weight/height data set (columns: Gender, Height, Weight)
ex2 = pd.read_csv('./data/weight-height.csv')



In [85]:

    
ex2.head()



In [86]:

    
ex2.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
Gender    10000 non-null object
Height    10000 non-null float64
Weight    10000 non-null float64
dtypes: float64(2), object(1)
memory usage: 234.5+ KB



In [87]:

    
ex2.describe()









    Out[87]:







  
    
      
      Height
      Weight
    
  
  
    
      count
      10000.000000
      10000.000000
    
    
      mean
      66.367560
      161.440357
    
    
      std
      3.847528
      32.108439
    
    
      min
      54.263133
      64.700127
    
    
      25%
      63.505620
      135.818051
    
    
      50%
      66.318070
      161.212928
    
    
      75%
      69.174262
      187.169525
    
    
      max
      78.998742
      269.989699



In [88]:

    
# Weight as a function of Height for the whole population
ex2.plot.scatter(x='Height', y='Weight')









    Out[88]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f613a574438>



In [89]:

    
# Split the population into two frames according to the 'Gender' column
ex2_males = ex2.loc[ex2['Gender'].eq('Male')]
ex2_females = ex2.loc[ex2['Gender'].eq('Female')]



In [90]:

    
# Both populations on one set of axes, one colour each. Each scatter gets a
# label so the legend tells the colours apart, and the title describes the whole
# figure (it previously read 'Male' although both groups are shown).
fig, ax = plt.subplots(figsize = (10, 10))
ex2_males.plot(kind = 'scatter',
               x = 'Height',
               y = 'Weight',
               ax = ax,
               color = 'blue',
               alpha = 0.2,
               label = 'Male')
ex2_females.plot(kind = 'scatter',
                 x = 'Height',
                 y = 'Weight',
                 ax = ax,
                 color = 'red',
                 alpha = 0.2,
                 label = 'Female')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.legend()
ax.set_title('Weight as a function of Height: Male and Female populations')









    Out[90]:





<matplotlib.text.Text at 0x7f613a40e8d0>

Exercise 3

plot the histogram of the heights for males and for females on the same plot
use alpha to control transparency in the plot command
plot a vertical line at the mean of each population using plt.axvline()



In [91]:

    
# Reuse the per-gender frames built in Exercise 2 (same objects, no copy)
ex3_males = ex2_males
ex3_females = ex2_females
fig, ax = plt.subplots(figsize = (10, 10))
# Overlaid height histograms. Each one is drawn on the explicit axes and given
# its own label; without labels both legend entries would read 'Height'.
ex3_males['Height'].plot(kind = 'hist',
                         bins = 30,
                         color = 'blue',
                         alpha = 0.3,
                         ax = ax,
                         label = 'Male')
ex3_females['Height'].plot(kind = 'hist',
                           bins = 30,
                           color = 'red',
                           alpha = 0.3,
                           ax = ax,
                           label = 'Female')
# Dashed vertical line at the mean height of each population
ax.axvline(ex3_males['Height'].mean(),
           linewidth = 3,
           linestyle = 'dashed',
           color = 'blue',
           label = 'Male mean')
ax.axvline(ex3_females['Height'].mean(),
           linewidth = 3,
           linestyle = 'dashed',
           color = 'red',
           label = 'Female mean')
ax.set_title('Height distribution by gender')
ax.set_xlabel('Height')
ax.legend()









    Out[91]:





<matplotlib.legend.Legend at 0x7f613a3454e0>

Exercise 4

plot the weights of the males and females using a box plot
which one is easier to read?
(remember to put in titles, axes and legends)



In [92]:

    
# `ex4` is a second name for the same DataFrame object as `ex2`; no copy is made
ex4 = ex2
ex4.head()



In [93]:

    
# Reshape to one 'Weight' column per gender. Rows keep their original index, so
# every row has a value in exactly one of the two columns and NaN in the other.
ex4_pivot = ex4.pivot(columns = 'Gender',
                      values = 'Weight')



In [94]:

    
ex4_pivot.head()



In [95]:

    
ex4_pivot.tail()



In [96]:

    
# Box plot of weight for each gender. The exercise asks for titles and axis
# labels, so both are set; the fixed y ticks together with the grid make the
# two boxes easy to compare.
ex4_pivot.plot(figsize = (10, 10),
               kind = 'box',
               yticks = [120, 150, 200, 250],
               title = 'Weight distribution by gender')
plt.xlabel('Gender')
plt.ylabel('Weight')
plt.grid()
plt.show()

Exercise 5

load the dataset: ./data/titanic-train.csv
learn about scattermatrix here: http://pandas.pydata.org/pandas-docs/stable/visualization.html
display the data using a scattermatrix



In [97]:

    
# Exercise 5: reload the Titanic training set under its own name, because `df`
# was rebound to other data earlier in the notebook
ex5 = pd.read_csv('./data/titanic-train.csv')
ex5.head()









    Out[97]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [98]:

    
# Dropping passenger Id so it does not take up a row and column of the scatter matrix.
# errors='ignore' keeps the cell re-runnable: without it a second run would
# raise a KeyError because the column is already gone.
ex5 = ex5.drop('PassengerId', axis = 1, errors = 'ignore')



In [99]:

    
from pandas.plotting import scatter_matrix



In [100]:

    
# Grid of pairwise scatter plots of the numeric columns, with each column's
# distribution on the diagonal. Assigning the result to `_` keeps the returned
# array of Axes from being printed below the figure.
_ = scatter_matrix(ex5, figsize = (15, 15))

Exploring more info about the data



In [101]:

    
# Kernel density estimate: a smoothed view of how the 'Age' values are distributed
ex5['Age'].plot.kde(figsize = (7, 7))









    Out[101]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6139cc1d30>



In [102]:

    
ex5['Fare'].plot.kde(figsize = (7, 7))









    Out[102]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6139bcb780>



In [103]:

    
ex5['SibSp'].plot.kde(figsize = (7, 7))









    Out[103]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6139cd02e8>

	data1	data2	data3	data4
0	0.170847	0.605400	2.432583	3.103751
1	0.090003	0.583349	2.397237	2.815148
2	0.158135	1.094243	2.706464	2.989091
3	-0.056954	0.054362	2.182445	2.953376
4	0.010518	1.100732	2.308244	3.072519

	x	y
0	-1.389318	-3.243604
1	4.715768	-2.839096
2	2.089588	0.514746
3	-2.882980	0.340440
4	1.842628	3.096558

	Gender	Height	Weight
0	Male	73.847017	241.893563
1	Male	68.781904	162.310473
2	Male	74.110105	212.740856
3	Male	71.730978	220.042470
4	Male	69.881796	206.349801

	Gender	Height	Weight
0	Male	73.847017	241.893563
1	Male	68.781904	162.310473
2	Male	74.110105	212.740856
3	Male	71.730978	220.042470
4	Male	69.881796	206.349801

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Survived	Pclass	Name	Sex	Age	Ticket	Fare	Cabin	Embarked
96	97	0	1	Goldschmidt, Mr. George B	male	71.0	PC 17754	34.6542	A5	C
116	117	0	3	Connors, Mr. Patrick	male	70.5	370369	7.7500	NaN	Q
493	494	0	1	Artagaveytia, Mr. Ramon	male	71.0	PC 17609	49.5042	NaN	C
630	631	1	1	Barkworth, Mr. Algernon Henry Wilson	male	80.0	27042	30.0000	A23	S
851	852	0	3	Svensson, Mr. Johan	male	74.0	347060	7.7750	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
59	60	0	3	Goodwin, Master. William Frederick	male	11.0	5	2	CA 2144	46.9000	NaN	S
71	72	0	3	Goodwin, Miss. Lillian Amy	female	16.0	5	2	CA 2144	46.9000	NaN	S
386	387	0	3	Goodwin, Master. Sidney Leonard	male	1.0	5	2	CA 2144	46.9000	NaN	S
480	481	0	3	Goodwin, Master. Harold Victor	male	9.0	5	2	CA 2144	46.9000	NaN	S
542	543	0	3	Andersson, Miss. Sigrid Elisabeth	female	11.0	4	2	347082	31.2750	NaN	S
683	684	0	3	Goodwin, Mr. Charles Edward	male	14.0	5	2	CA 2144	46.9000	NaN	S
731	732	0	3	Hassan, Mr. Houssein G N	male	11.0	0	0	2699	18.7875	NaN	C
802	803	1	1	Carter, Master. William Thornton II	male	11.0	1	2	113760	120.0000	B96 B98	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
803	804	1	3	Thomas, Master. Assad Alexander	male	0.42	0	1	2625	8.5167	NaN	C
755	756	1	2	Hamalainen, Master. Viljo	male	0.67	1	1	250649	14.5000	NaN	S
644	645	1	3	Baclini, Miss. Eugenie	female	0.75	2	1	2666	19.2583	NaN	C
469	470	1	3	Baclini, Miss. Helene Barbara	female	0.75	2	1	2666	19.2583	NaN	C
78	79	1	2	Caldwell, Master. Alden Gates	male	0.83	0	2	248738	29.0000	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare	IsFemale
PassengerId	1.000000	-0.005007	-0.035144	0.036847	-0.057527	-0.001652	0.012658	-0.042939
Survived	-0.005007	1.000000	-0.338481	-0.077221	-0.035322	0.081629	0.257307	0.543351
Pclass	-0.035144	-0.338481	1.000000	-0.369226	0.083081	0.018443	-0.549500	-0.131900
Age	0.036847	-0.077221	-0.369226	1.000000	-0.308247	-0.189119	0.096067	-0.093254
SibSp	-0.057527	-0.035322	0.083081	-0.308247	1.000000	0.414838	0.159651	0.114631
Parch	-0.001652	0.081629	0.018443	-0.189119	0.414838	1.000000	0.216225	0.245489
Fare	0.012658	0.257307	-0.549500	0.096067	0.159651	0.216225	1.000000	0.182333
IsFemale	-0.042939	0.543351	-0.131900	-0.093254	0.114631	0.245489	0.182333	1.000000

	Thousand Passengers
Month
1949-01-01	112
1949-02-01	118
1949-03-01	132
1949-04-01	129
1949-05-01	121

	Height	Weight
count	10000.000000	10000.000000
mean	66.367560	161.440357
std	3.847528	32.108439
min	54.263133	64.700127
25%	63.505620	135.818051
50%	66.318070	161.212928
75%	69.174262	187.169525
max	78.998742	269.989699

Gender	Female	Male
9995	136.777454	NaN
9996	170.867906	NaN
9997	128.475319	NaN
9998	163.852461	NaN
9999	113.649103	NaN